# Reproducible simulation: linear trend y = 1.5 x + 6 with heavy-tailed
# Student-t (df = 2) noise, three replicates for each x in 1..10.
set.seed(0)
suppressMessages(library(dplyr))
sim1a <- tibble(
  x = rep(1:10, each = 3),
  y = 1.5 * x + 6 + rt(length(x), df = 2)
)
suppressMessages(library(pander))
# Ordinary least-squares fit of the simulated data.
sim1a.model <- lm(y ~ x, data = sim1a)
pander(sim1a.model)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 7.727 | 2.228 | 3.468 | 0.001712 |
| x | 1.12 | 0.3591 | 3.118 | 0.004188 |
suppressMessages(library(ggplot2))
suppressMessages(library(plotly))
# Keep a plain data.frame copy for plotting; sim1a itself gains an `id`
# column so it can be stacked with the later simulations.
sim1a.df <- as.data.frame(sim1a)
sim1a <- cbind(sim1a, id = 'a')
# Scatter of the simulated points with the lm fit drawn as a line layer.
plot1.gg <- ggplot(data = sim1a.df, aes(x = x, y = y)) +
  geom_point() +
  geom_line(stat = "smooth", method = "lm")
ggplotly(plot1.gg)
# Generate one fresh simulated data set: x in 1..10 with 3 replicates each,
# y = 1.5 x + 6 plus heavy-tailed t(df = 2) noise. Returned as a plain
# data.frame so it can be cbind/rbind-ed with an id column later.
simfunction <- function() {
  sim <- tibble(
    x = rep(1:10, each = 3),
    y = 1.5 * x + 6 + rt(length(x), df = 2)
  )
  as.data.frame(sim)
}
# Stack simulation 'a' (sim1a) with 25 more independent simulations, one for
# each remaining letter. Building the pieces with lapply() and binding once
# avoids the original pattern of growing sims.df with rbind() inside a loop
# (quadratic copying); the row order and RNG call sequence are unchanged.
sims.df <- do.call(
  rbind,
  c(
    list(sim1a),
    lapply(letters[-1], function(letter) cbind(simfunction(), id = letter))
  )
)
# All 26 simulations overlaid, coloured by simulation id, each with its own
# lm smooth line.
sims.gg <- ggplot(data = sims.df, aes(x = x, y = y, group = id, color = id)) +
  geom_point() +
  geom_line(stat = "smooth", method = "lm")
ggplotly(sims.gg)
# Same data as small multiples, one panel per simulation id.
sims.gg + facet_wrap(~id, ncol = 4, scales = 'free_y')
Expected y-intercept: 6; expected slope: 1.5.
# Fit one lm(y ~ x) per simulation id; dlply returns a named list of models
# keyed by id, which pander prints as one coefficient table per letter below.
models <- plyr::dlply(sims.df, "id", function(df) lm(y ~ x, data = df))
pander(models)
a:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 7.727 | 2.228 | 3.468 | 0.001712 |
| x | 1.12 | 0.3591 | 3.118 | 0.004188 |
b:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 4.696 | 1.114 | 4.217 | 0.0002341 |
| x | 1.678 | 0.1795 | 9.351 | 4.149e-10 |
c:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 3.105 | 3.22 | 0.9642 | 0.3432 |
| x | 1.827 | 0.5189 | 3.52 | 0.001497 |
d:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 5.384 | 0.7209 | 7.469 | 3.906e-08 |
| x | 1.644 | 0.1162 | 14.15 | 2.778e-14 |
e:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 7.006 | 0.8601 | 8.146 | 7.218e-09 |
| x | 1.317 | 0.1386 | 9.502 | 2.937e-10 |
f:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 7.078 | 0.7166 | 9.877 | 1.263e-10 |
| x | 1.308 | 0.1155 | 11.32 | 5.765e-12 |
g:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 7.44 | 2.132 | 3.49 | 0.001619 |
| x | 1.275 | 0.3436 | 3.712 | 0.0009049 |
h:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 7.349 | 3.646 | 2.016 | 0.05351 |
| x | 0.9906 | 0.5876 | 1.686 | 0.1029 |
i:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 6.559 | 0.9096 | 7.21 | 7.558e-08 |
| x | 1.496 | 0.1466 | 10.2 | 6.17e-11 |
j:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 6.112 | 0.6171 | 9.904 | 1.19e-10 |
| x | 1.52 | 0.09945 | 15.29 | 4.066e-15 |
k:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 6.778 | 0.936 | 7.241 | 6.978e-08 |
| x | 1.484 | 0.1509 | 9.836 | 1.385e-10 |
l:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 7.01 | 0.9607 | 7.297 | 6.056e-08 |
| x | 1.261 | 0.1548 | 8.144 | 7.258e-09 |
m:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 8.44 | 1.185 | 7.119 | 9.548e-08 |
| x | 0.9925 | 0.1911 | 5.195 | 1.627e-05 |
n:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 6.892 | 0.6004 | 11.48 | 4.211e-12 |
| x | 1.413 | 0.09676 | 14.6 | 1.278e-14 |
o:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 6.365 | 0.8207 | 7.755 | 1.899e-08 |
| x | 1.422 | 0.1323 | 10.75 | 1.893e-11 |
p:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 5.96 | 0.555 | 10.74 | 1.948e-11 |
| x | 1.552 | 0.08944 | 17.36 | 1.609e-16 |
q:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 5.546 | 0.8886 | 6.242 | 9.575e-07 |
| x | 1.47 | 0.1432 | 10.26 | 5.389e-11 |
r:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 6.446 | 0.7161 | 9.002 | 9.288e-10 |
| x | 1.427 | 0.1154 | 12.37 | 7.316e-13 |
s:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 6.608 | 0.6755 | 9.782 | 1.564e-10 |
| x | 1.417 | 0.1089 | 13.02 | 2.13e-13 |
t:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 7.08 | 1.097 | 6.451 | 5.482e-07 |
| x | 1.333 | 0.1769 | 7.534 | 3.311e-08 |
u:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 5.741 | 0.5958 | 9.636 | 2.169e-10 |
| x | 1.583 | 0.09602 | 16.49 | 5.983e-16 |
v:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 5.875 | 0.7642 | 7.688 | 2.249e-08 |
| x | 1.46 | 0.1232 | 11.86 | 1.978e-12 |
w:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 9.64 | 1.54 | 6.258 | 9.177e-07 |
| x | 0.9224 | 0.2483 | 3.715 | 0.0008962 |
x:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 5.825 | 0.4988 | 11.68 | 2.825e-12 |
| x | 1.5 | 0.08039 | 18.66 | 2.481e-17 |
y:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 6.835 | 0.7116 | 9.605 | 2.326e-10 |
| x | 1.371 | 0.1147 | 11.95 | 1.638e-12 |
z:
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 5.767 | 0.8817 | 6.541 | 4.327e-07 |
| x | 1.666 | 0.1421 | 11.72 | 2.573e-12 |
Some of the models get pretty close, but when there are one or two values on the high or low end that are extremely off, it throws the slope of the linear model completely off. This is also partially due to the fact that there are only 30 samples, so a few extremes throw off the squared values that are used to minimize the linear model.
# Predict y for every row of `data` using a straight line with slope a[1]
# and intercept a[2].
make_prediction <- function(a, data) {
  a[1] * data$x + a[2]
}

# Model-quality objective: mean absolute deviation between observed y and
# the line described by `mod` (slope, intercept). Absolute error is less
# sensitive to outliers than the squared error lm() minimizes.
measure_distance <- function(mod, data) {
  abs_err <- abs(data$y - make_prediction(mod, data))
  mean(abs_err)
}
# Minimize the mean-absolute-deviation objective over c(slope, intercept),
# starting from (0, 0); pander prints optim()'s full result list.
pander(optim(c(0,0), measure_distance, data = sim1a))
counts:
| function | gradient |
|---|---|
| 137 | NA |
message:
Optim found an a[1] of 1.51 and an a[2] of 6.027 on sim1a, where lm(y ~ x) found an a[1] of 1.12 and an a[2] of 7.727 on the same input data.
# Fit the optim()-based absolute-deviation model separately per simulation id.
abs.model <- plyr::dlply(sims.df, "id", function(df) optim(c(0, 0), measure_distance, data = df))

# Collect the fitted (slope, intercept) pairs from every optim fit.
# NOTE: the original seeded this with `abs.model[1]$par`, which is NULL
# (`[` keeps the list wrapper, so `$par` matches nothing) and then grew the
# matrix row-by-row with rbind(); building it in one do.call() is both
# intentional and linear-time, and yields the same 26 x 2 result.
pars <- do.call(rbind, lapply(abs.model, function(m) m$par))
colnames(pars) <- c("optim.m", "optim.b")

# Collect the lm() coefficients (intercept, slope) per simulation id the
# same way instead of rbind-ing onto an empty data.frame in a loop.
pars.lm <- as.data.frame(do.call(rbind, lapply(models, function(m) m$coefficients)))
colnames(pars.lm) <- c("lm.b", "lm.m")

# One row per simulation: optim estimates, lm estimates, and the sim letter.
pars.all <- cbind(pars, pars.lm, letters)
# Dot plot comparing optim() and lm() estimates of slope (m) and intercept (b)
# for each of the 26 simulations, one x position per simulation letter.
ggplot(aes(x = as.factor(letters)), data = pars.all) +
geom_point(aes(y = optim.m, color = 'optim.m')) +
geom_point(aes(y = lm.m, color = 'lm.m')) +
geom_point(aes(y = optim.b, color = 'optim.b')) +
geom_point(aes(y = lm.b, color = 'lm.b')) +
theme(legend.position = 'top') +
ylab("Predicted Value") + xlab("Simulation Number")
# Box plots of the same estimates grouped by fitting method; a tighter box
# means that method's estimates cluster more closely around one value.
ggplot(data = pars.all) +
geom_boxplot(aes(x = 'optim', y = optim.m, color = 'optim.m')) +
geom_boxplot(aes(x = 'lm', y = lm.m, color = 'lm.m')) +
geom_boxplot(aes(x = 'optim', y = optim.b, color = 'optim.b')) +
geom_boxplot(aes(x = 'lm', y = lm.b, color = 'lm.b')) +
theme(legend.position = 'top') +
ylab("Predicted Value") + xlab("Prediction Model")
Optim() with the fitted model vs. the linear model also has its own inaccuracies. However, as shown by the box plots, its calculated values of m and b much more tightly wrap the expected values than the lm() predictions do.
23.3.3 (#1, #3, #4)
Instead of using lm() to fit a straight line, you can use loess() to fit a smooth curve. Repeat the process of model fitting, grid generation, predictions, and visualisation on sim1 using loess() instead of lm(). How does the result compare to geom_smooth()?
gg.geom_line <- ggplot(aes(x=x,y=y), data = sim1a) +
# lm (straight line) and loess (smooth curve) fits overlaid on the points,
# drawn with geom_line(stat = "smooth") rather than geom_smooth().
geom_line(aes(color = "geom_line, smooth, lm"), stat ="smooth", method = "lm", alpha = .5) +
geom_line(aes(color = "geom_line, smooth, loess"), stat ="smooth", method = "loess", alpha = .5) +
geom_point() +
theme(legend.position = 'none')
# The same loess fit via geom_smooth(), which also draws a confidence band.
gg.geom_smooth <- ggplot(data = sim1a, aes(x = x, y = y)) +
  geom_smooth(method = 'loess') +
  geom_point()
suppressMessages(library(gridExtra))
# Side by side: manual geom_line(stat = "smooth") vs geom_smooth().
grid.arrange(gg.geom_line, gg.geom_smooth, ncol = 2)
As far as I can tell the visualization of the line done by geom_line() with method = "loess" and stat = "smooth" is identical to geom_smooth() with the exception being that by default geom_smooth() shows a confidence interval around the fitted line, which can be hidden by adding se = FALSE to geom_smooth().
# Inspect the loess fit's defaults (span = 0.75, degree = 2, ...).
summary(loess(y ~ x, data = sim1a))
## Call:
## loess(formula = y ~ x, data = sim1a)
##
## Number of Observations: 30
## Equivalent Number of Parameters: 4.19
## Residual Standard Error: 5.907
## Trace of smoother matrix: 4.58 (exact)
##
## Control settings:
## span : 0.75
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
# loess version of the per-simulation plot: all 26 sims overlaid, then
# faceted into small multiples.
sims.gg.loess <- ggplot(data = sims.df, aes(x = x, y = y, group = id, color = id)) +
  geom_point() +
  geom_line(stat = "smooth", method = "loess")
ggplotly(sims.gg.loess)
ggplotly(
  sims.gg.loess + facet_wrap(~id, ncol = 2, scales = 'free_y')
)
For whatever reason my y sim isn’t showing up so here it is.
# Simulation 'y' on its own, since it was missing from the faceted view.
y.gg <- ggplot(
  data = filter(sims.df, id == 'y'),
  aes(x = x, y = y, color = id)
) +
  geom_point() +
  geom_line(stat = "smooth", method = "loess")
ggplotly(y.gg)
geom_ref_line() do? What package does it come from? Why is displaying a reference line in plots showing residuals useful and important?
geom_ref_line() comes from the modelr package. It is used to add a reference line to a ggplot2 plot. It can be useful for putting a line through a value of interest to give a visual reference to how far points are from an expected value. This is especially useful when the x axis is non numeric.
A frequency polygon of the absolute residuals might give you a better idea of where the true values that your model is predicting might lie. In theory, given a large enough sample size, the frequency polygon should be most dense near the true value that your model is looking for. It can also make it visually obvious if there are any glaring issues with your model. A large frequency far from zero means that something is wrong in your model or data that you are probably not accounting for, because the residuals should be especially dense around zero.
suppressMessages(library(modelr))
# Attach the lm residuals to sim1a and look at their distribution; for a
# good fit the frequency polygon should concentrate around zero.
sim1a.res <- add_residuals(sim1a, sim1a.model)
ggplot(data = sim1a.res, aes(resid)) + geom_freqpoly(binwidth = .5)
In this case there is definitely a data point in sim1a that is way off from what is expected by the simple linear model. But for the most part the lm() did a pretty good job with its fit.
23.4.5 (#1, #3, #4)
# Scatterplot of sim2: a categorical predictor x with levels a-d.
ggplot(sim2) + geom_point(aes(x = x, y = y))
# With a factor predictor, lm() uses treatment coding: the intercept is the
# "a"-level mean and xb/xc/xd are offsets from it (compare the prediction
# grid printed below).
mod <- lm(y ~ x, data = sim2)
pander(mod)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 1.152 | 0.3475 | 3.316 | 0.002095 |
| xb | 6.964 | 0.4914 | 14.17 | 2.684e-16 |
| xc | 4.975 | 0.4914 | 10.12 | 4.469e-12 |
| xd | 0.7588 | 0.4914 | 1.544 | 0.1313 |
# Prediction grid: one row per level of x with mod's predicted value.
grid <- add_predictions(data_grid(sim2, x), mod)
pander(grid)
| x | pred |
|---|---|
| a | 1.152 |
| b | 8.116 |
| c | 6.127 |
| d | 1.911 |
# Raw observations in black with the model's per-level predictions in red.
ggplot(data = sim2, aes(x)) +
  geom_point(aes(y = y)) +
  geom_point(data = grid, aes(y = pred), colour = "red", size = 4)
# Dropping the intercept (y ~ 0 + x) re-parameterizes the same model: each
# coefficient xa..xd is now the raw mean of its level rather than an offset
# from level "a".
mod2 <- lm(y ~ 0 + x, data = sim2)
pander(mod2)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| xa | 1.152 | 0.3475 | 3.316 | 0.002095 |
| xb | 8.116 | 0.3475 | 23.36 | 2.428e-23 |
| xc | 6.127 | 0.3475 | 17.63 | 2.709e-19 |
| xd | 1.911 | 0.3475 | 5.499 | 3.245e-06 |
# Prediction grid for the no-intercept model; its predictions match mod's,
# since both parameterizations describe the same four level means.
grid2 <- add_predictions(data_grid(sim2, x), mod2)
pander(grid2)
| x | pred |
|---|---|
| a | 1.152 |
| b | 8.116 |
| c | 6.127 |
| d | 1.911 |
# Same overlay as before but using the no-intercept model's predictions.
ggplot(data = sim2, aes(x)) +
  geom_point(aes(y = y)) +
  geom_point(data = grid2, aes(y = pred), colour = "red", size = 4)
Without the intercept, the model is parameterized on xa, xb, xc, and xd instead of an intercept plus x{b,c,d}; however, the t values are much larger and the Pr(>|t|) significance levels are much smaller, which is better.
# Additive model of sim3: continuous x1 plus factor x2.
mod1 <- lm(y ~ x1 + x2, data = sim3)
# Same model with columns referenced by position, showing the fit is
# identical; only the coefficient labels change (data[[b]], data[[c]]b, ...).
mod1.func <- function(a,b,c,data) {
lm(data[[a]] ~ data[[b]] + data[[c]], data = data)
}
# Positions used: 4 = y, 1 = x1, 2 = x2 (matches the tables printed below).
mod1.f <- mod1.func(4,1,2,sim3)
pander(mod1)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 1.872 | 0.3874 | 4.832 | 4.218e-06 |
| x1 | -0.1967 | 0.04871 | -4.039 | 9.718e-05 |
| x2b | 2.888 | 0.3957 | 7.298 | 4.066e-11 |
| x2c | 4.806 | 0.3957 | 12.14 | 2.42e-22 |
| x2d | 2.36 | 0.3957 | 5.963 | 2.787e-08 |
pander(mod1.f)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 1.872 | 0.3874 | 4.832 | 4.218e-06 |
| data[[b]] | -0.1967 | 0.04871 | -4.039 | 9.718e-05 |
| data[[c]]b | 2.888 | 0.3957 | 7.298 | 4.066e-11 |
| data[[c]]c | 4.806 | 0.3957 | 12.14 | 2.42e-22 |
| data[[c]]d | 2.36 | 0.3957 | 5.963 | 2.787e-08 |
# Interaction model of sim3: x1 * x2 expands to x1 + x2 + x1:x2.
mod2 <- lm(y ~ x1 * x2, data = sim3)
# Position-based version of the same interaction model; the estimates match
# mod2's, only the term labels differ.
mod2.func <- function(a,b,c,data) {
lm(data[[a]] ~ data[[b]] * data[[c]], data = data)
}
mod2.f <- mod2.func(4,1,2,sim3)
pander(mod2)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 1.301 | 0.404 | 3.221 | 0.001673 |
| x1 | -0.09302 | 0.06511 | -1.429 | 0.1559 |
| x2b | 7.069 | 0.5713 | 12.37 | 1.092e-22 |
| x2c | 4.431 | 0.5713 | 7.755 | 4.407e-12 |
| x2d | 0.8346 | 0.5713 | 1.461 | 0.1469 |
| x1:x2b | -0.7603 | 0.09208 | -8.257 | 3.301e-13 |
| x1:x2c | 0.06815 | 0.09208 | 0.7402 | 0.4608 |
| x1:x2d | 0.2773 | 0.09208 | 3.011 | 0.003216 |
pander(mod2.f)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 1.301 | 0.404 | 3.221 | 0.001673 |
| data[[b]] | -0.09302 | 0.06511 | -1.429 | 0.1559 |
| data[[c]]b | 7.069 | 0.5713 | 12.37 | 1.092e-22 |
| data[[c]]c | 4.431 | 0.5713 | 7.755 | 4.407e-12 |
| data[[c]]d | 0.8346 | 0.5713 | 1.461 | 0.1469 |
| data[[b]]:data[[c]]b | -0.7603 | 0.09208 | -8.257 | 3.301e-13 |
| data[[b]]:data[[c]]c | 0.06815 | 0.09208 | 0.7402 | 0.4608 |
| data[[b]]:data[[c]]d | 0.2773 | 0.09208 | 3.011 | 0.003216 |
# sim4 has two continuous predictors: mod1 is additive, mod2 adds the
# x1:x2 interaction term.
mod1 <- lm(y ~ x1 + x2, data = sim4)
mod2 <- lm(y ~ x1 * x2, data = sim4)
pander(mod1)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 0.03546 | 0.1218 | 0.291 | 0.7712 |
| x1 | 1.822 | 0.1909 | 9.543 | 5.326e-19 |
| x2 | -2.783 | 0.1909 | -14.58 | 1.12e-36 |
# The x1:x2 interaction is significant (p ~ 0.0014 in the table below), so
# mod2 is not redundant with mod1.
pander(mod2)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 0.03546 | 0.1199 | 0.2956 | 0.7677 |
| x1 | 1.822 | 0.1879 | 9.694 | 1.773e-19 |
| x2 | -2.783 | 0.1879 | -14.81 | 1.668e-37 |
| x1:x2 | 0.9523 | 0.2944 | 3.235 | 0.001356 |
# 10 x 10 grid over the ranges of x1 and x2, with predictions from both
# models stacked long (one copy of the grid per model).
grid <- gather_predictions(
  data_grid(
    sim4,
    x1 = seq_range(x1, 10),
    x2 = seq_range(x2, 10)
  ),
  mod1, mod2
)
# Heat map of each model's predictions, one facet per model.
tile.gg <- ggplot(grid, aes(x1, x2)) +
  geom_tile(aes(fill = pred)) +
  facet_wrap(~ model, ncol = 1)
ggplotly(tile.gg)
# Raw sim4 observations coloured by y for comparison against the tile maps.
# FIX: `size = 2` was previously mapped inside aes(), which treats the
# constant 2 as data and adds a meaningless size legend; a fixed size must
# be set outside aes().
ggplotly(ggplot(aes(x = x1, y = x2), data = sim4) + geom_point(aes(color = y), size = 2))
It's kind of hard to tell, but mod2 does a better job, as it is just a tad darker in its graph in the upper-left quartile, which can be seen in the geom_point plot above. This is a little hard to tell purely visually, but thanks to plotly::ggplotly() you can see the higher predictions in that upper quartile in mod2's plot.